
## A Research Note on the Prevalence
## of Housing Eviction among Children
## Born in American Cities

## CODE PART C:
## Translates model output into results
## presented in the paper and supplement

## Ian Lundberg and Louis Donnelly

## Department of Sociology, Office of Population Research,
## and Center for Research on Child Wellbeing
## Princeton University

## Code by Ian Lundberg (ilundberg@princeton.edu)
## Last updated 10 May 2018

## Set working directory
setwd("C:/Users/iandl/Documents/FF Eviction Prevalence")

## Load files from prior code chunks
load("Intermediate/d.Rdata")
load("Intermediate/IDs.Rdata")
load("Intermediate/amelia.out.Rdata")
load("Intermediate/prevalenceByWave_samples.Rdata")
load("Intermediate/Amelia.untransformed.Rdata")
load("Intermediate/imputed.cumulative.estimates.Rdata")
load("Intermediate/imputed.retro.Rdata")
load("Intermediate/lowerBound_and_MI.Rdata")
load("Intermediate/scale.factors.Rdata")
load("Intermediate/newData.Rdata")
load("Intermediate/specification_numbers.Rdata")

## Load required packages
library(tidyverse)
library(reshape2)
library(Amelia)
library(doParallel)
library(doRNG)
library(xtable)

###########################
## TABLE 2. DESCRIPTIVES ##
###########################

## Sink these descriptives to a text file
sink("Output/desc.txt", append = FALSE, split = FALSE)
print("Descriptives")
## A function to return descriptive statistics
get.desc <- function(type, name) {
  if (type == "disc") {
    estimates <- foreach(i = 1:n.imps, .combine = "rbind") %do% {
      amelia.out$imputations[[i]] %>%
        select(matches(name), m1natwt) %>%
        rename_at(1, .funs = function(x) "variable") %>%
        group_by(variable) %>%
        summarize(num = n(),
                  weight = sum(m1natwt, na.rm = T)) %>%
        group_by() %>%
        transmute(imputation = i,
                  variable = variable,
                  unweighted = num / sum(num),
                  weighted = weight / sum(weight))
    } %>%
      group_by(variable) %>%
      summarize_all(.funs = mean) %>%
      select(variable, unweighted, weighted) %>%
      mutate(prop.missing = mean(is.na(d[,name])))
  }
  else if (type == "cts") {
    estimates <- foreach(i = 1:n.imps, .combine = "rbind") %do% {
      amelia.out$imputations[[i]] %>%
        select(matches(name), m1natwt) %>%
        rename_at(1, .funs = function(x) "value") %>%
        mutate(imputation = i,
               unweighted.mean = mean(value),
               unweighted.sd = sd(value)) %>%
        filter(!is.na(m1natwt)) %>%
        mutate(weighted.mean = sum(m1natwt*value) / sum(m1natwt),
               weighted.sd = sqrt(sum(m1natwt*(value - weighted.mean) ^ 2) /
                                    sum(m1natwt)),
               index = 1:n() == 1) %>%
        filter(index == 1) %>%
        mutate(variable = name) %>%
        select(variable, imputation,
               unweighted.mean, unweighted.sd, 
               weighted.mean, weighted.sd)
    } %>%
      group_by(variable) %>%
      summarize_all(.funs = mean) %>%
      select(variable, unweighted.mean, unweighted.sd,
             weighted.mean, weighted.sd) %>%
      mutate(prop.missing = mean(is.na(d[,name])))
  }
  return(estimates)
}
foreach(var = c("cm1ethrace","cm1edu"), .combine = "rbind") %do% {
  print(get.desc("disc",name = var))
}
foreach(var = c("married","cm1age","impulsivity","cm3cogsc",
                "prop.white","prop.black","prop.hisp",
                "prop.all.other","prop.below.povline",
                "med.rent.over.income", "m1imm","cmf2fevjail"),
        .combine = "rbind") %do% {
          print(get.desc("cts",name = var))
        }
## Income is different since averaged over waves,
## so proportion missing is more complicated
print("Proportion in each permanent income category")
estimates <- foreach(i = 1:n.imps, .combine = "rbind") %do% {
  amelia.out$imputations[[i]] %>%
    select(income_mean_cat, m1natwt) %>%
    rename_at(1, .funs = function(x) "variable") %>%
    group_by(variable) %>%
    summarize(num = n(),
              weight = sum(m1natwt, na.rm = T)) %>%
    group_by() %>%
    transmute(imputation = i,
              variable = variable,
              unweighted = num / sum(num),
              weighted = weight / sum(weight))
} %>%
  group_by(variable) %>%
  summarize_all(.funs = mean) %>%
  select(variable, unweighted, weighted)
print(data.frame(estimates))
print("Prop. missing income each wave")
print(round(apply(d[,paste0("inc",2:6)], 2, function(x) mean(is.na(x))),3))

## Housing status is also different since averaged over waves,
## so proportion missing is more complicated
print("Mean prop. years in owned home")
estimates <- foreach(i = 1:n.imps, .combine = "rbind") %do% {
  amelia.out$imputations[[i]] %>%
    select(prop_own, m1natwt) %>%
    rename_at(1, .funs = function(x) "value") %>%
    mutate(imputation = i,
           unweighted.mean = mean(value),
           unweighted.sd = sd(value)) %>%
    filter(!is.na(m1natwt)) %>%
    mutate(weighted.mean = sum(m1natwt*value) / sum(m1natwt),
           weighted.sd = sqrt(sum(m1natwt*(value - weighted.mean) ^ 2) /
                                sum(m1natwt)),
           index = 1:n() == 1) %>%
    filter(index == 1) %>%
    mutate(variable = "prop_own") %>%
    select(variable, imputation,
           unweighted.mean, unweighted.sd, 
           weighted.mean, weighted.sd)
} %>%
  group_by(variable) %>%
  summarize_all(.funs = mean) %>%
  select(variable, unweighted.mean, unweighted.sd,
         weighted.mean, weighted.sd)
print(data.frame(estimates))
print("Prop. missing ownership each wave")
print(round(apply(d[,paste0("own",2:6)], 2, function(x) mean(is.na(x))),3))

## Rent burden is similar
print("Mean rent burden")
estimates <- foreach(i = 1:n.imps, .combine = "rbind") %do% {
  amelia.out$imputations[[i]] %>%
    select(rent_burden, m1natwt) %>%
    rename_at(1, .funs = function(x) "value") %>%
    mutate(imputation = i,
           unweighted.mean = mean(value),
           unweighted.sd = sd(value)) %>%
    filter(!is.na(m1natwt)) %>%
    mutate(weighted.mean = sum(m1natwt*value) / sum(m1natwt),
           weighted.sd = sqrt(sum(m1natwt*(value - weighted.mean) ^ 2) /
                                sum(m1natwt)),
           index = 1:n() == 1) %>%
    filter(index == 1) %>%
    mutate(variable = "rent_burden") %>%
    select(variable, imputation,
           unweighted.mean, unweighted.sd, 
           weighted.mean, weighted.sd)
} %>%
  group_by(variable) %>%
  summarize_all(.funs = mean) %>%
  select(variable, unweighted.mean, unweighted.sd,
         weighted.mean, weighted.sd)
print(data.frame(estimates))
print("Prop. missing rent burden each wave")
print(round(apply(d[,paste0("rent_burden",2:6)], 2, function(x) mean(is.na(x))),3))
sink()

##################################
## FIGURE 1. Prevalence by wave ##
##################################

prevalenceByWave_samples %>%
  melt(id = NULL,
       variable.name = "wave") %>%
  group_by(wave) %>%
  summarize(mean = mean(value),
            min = quantile(value, .025),
            max = quantile(value, .975)) %>%
  ggplot(aes(x = wave, y = mean,
             label = paste(formatC(100*mean, digits=1, format = "f"),"%"),
             ymin = min, ymax = max)) +
  geom_errorbar() +
  geom_label() + 
  ylab("Evicted in\npast 12 months") +
  xlab("Survey wave") +
  scale_x_discrete(
    labels = c(
      "Wave 1\nBirth - Age 1\n1999 - 2000",
      "Wave 2\nAge 2 - 3\n2001 - 2002",
      "Wave 3\nAge 4 - 5\n2004 - 2005",
      "Wave 4\nAge 8 - 9\n2008 - 2009",
      "Wave 5\nAge 14 - 15\n2015 - 2016"
    )
  ) +
  ggsave("Output/evByWave_weightedOnly.pdf",
         height = 2.5, width = 5)

#################################
## Prepare for modeled figures ##
#################################

load("Intermediate/stanPred.Rdata")
stanPred_short <- stanPred %>%
  filter(model == max(model) & re == T) %>%
  select(-model, -re) %>%
  left_join(IDs %>%
              select(idnum, person),
            by = "person")
rm(stanPred)

cl <- makeCluster(15)
registerDoParallel(cl)
model_estimates <- foreach(
  drawName = colnames(stanPred_short)[grepl("draw",colnames(stanPred_short))],
  .packages = c("tidyverse"),
  .combine = "rbind"
) %dopar% {
  # Identify imputation number
  imp <- as.numeric(gsub("imp|_.*","",drawName))
  draw <- as.numeric(gsub(".*draw","",drawName))

  # Create a data frame with the covariates and this draw
  withCovs <- data.frame(
    idnum = stanPred_short$idnum,
    value = stanPred_short[,drawName]
  ) %>%
    left_join(
      amelia.out$imputations[[imp]] %>%
        select(idnum, m1natwt, m1citywt,
               cm1ethrace,
               income_mean_cat, m1city),
      by = "idnum"
    )
  
  # Return the estimates for all groups
  bind_rows(
    withCovs %>%
      filter(!is.na(m1natwt)) %>%
      summarize(group = "overall.Overall",
                estimate = weighted.mean(value, w = m1natwt)),
    withCovs %>%
      filter(!is.na(m1natwt)) %>%
      mutate(group = paste0("race.",gsub(" ","_",cm1ethrace))) %>%
      group_by(group) %>%
      summarize(estimate = weighted.mean(value, w = m1natwt)),
    withCovs %>%
      filter(!is.na(m1natwt)) %>%
      mutate(group = paste0("income.",gsub(" ","_",income_mean_cat))) %>%
      group_by(group) %>%
      summarize(estimate = weighted.mean(value, w = m1natwt)),
    withCovs %>%
      filter(!is.na(m1citywt)) %>%
      mutate(group = paste0("city.",gsub(" ","_",gsub("[[:digit:]]+ ","",m1city)))) %>%
      group_by(group) %>%
      summarize(estimate = weighted.mean(value, w = m1citywt))
  ) %>%
    mutate(imputation = imp,
           draw = draw)
}
stopCluster(cl)
  
## Summarize prevalence estimates
estimates <- model_estimates %>%
  group_by(group) %>%
  summarize(Mean = mean(estimate),
            Min = quantile(estimate, .025),
            Max = quantile(estimate, .975)) %>%
  separate(group, into = c("bigGroup","group"),
           sep = "\\.") %>%
  mutate(group = case_when(group == "Other" ~ "White/other",
                           TRUE ~ group),
         group = str_replace_all(group,"_"," "),
         bigGroup = case_when(
           bigGroup == "income" ~ "By permanent income / poverty line",
           bigGroup == "race" ~ "By race",
           TRUE ~ bigGroup
         ))

#########################################
## Figure 2. Plot of data availability ##
#########################################

data.frame(x = c(0,1,2,3,4,5,8,9,14,15,9,15),
           y = c(1,1,1,1,1,1,1,1,1,1,0,0),
           wave = rep(c("w1","w2","w3","w4","w5","w5retro"),
                      each = 2)) %>%
  ggplot(aes(x = x, y = y, linetype = wave)) +
  geom_point() +
  geom_line() +
  annotate(geom = "text", color = "black",
           x = 0, y = 0.5, hjust = 0,
           label = "Solid lines indicate 12-month reporting periods",
           size = 2.5) +
  annotate(geom = "text", color = "black",
           x = 12, y = -0.4,
           label = "Long-term recall: May understate prevalence",
           size = 2.5) +
  scale_linetype_manual(values = c(rep("solid",5),"F1")) +
  scale_y_continuous(breaks = NULL, limits = c(-.8,1.2),
                     name = "") +
  scale_x_continuous(breaks = c(0,1,2,3,4,5,8,9,14,15),
                     name = "Child age") +
  ggtitle("A. Child ages covered by eviction reports in the Fragile Families Study") +
  theme(legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "black"),
        plot.title = element_text(size = 11)) +
  ggsave("Output/EvictionData.pdf",
         height = 1.5, width = 6)

data.frame(x = c(0,1,2,3,4,5,8,9,9,15,
                 0,1,2,3,4,5,8,9,9,15,
                 0,1,2,3,4,5,8,9,14,15,
                 0,15),
           y = c(rep(c(4.4,2.7,0.8),each = 10),0.4,0.4),
           wave = factor(c(rep(1:5,each = 2),
                           rep(6:10,each = 2),
                           rep(11:15,each = 2),
                           rep(16,2))),
           coverage = factor(c(rep(2,10),
                               rep(1,8),rep(3,2),
                               rep(1,12)))) %>%
  ggplot(aes(x = x, y = y, linetype = coverage,
             color = wave)) +
  geom_point() +
  geom_line() +
  scale_linetype_manual(values = c("solid","twodash","F1")) +
  scale_color_manual(values = c(rep("red",5),
                                rep("blue",5),
                                rep("seagreen4",6))) +
  annotate(geom = "text", color = "red",
           x = 0, y = 4.6,
           label = "Absolute lower bound",
           size = 3, hjust = 0, fontface = "bold") +
  annotate(geom = "text", color = "red",
           x = 15, y = 4.1,
           label = "Dot-dash indicates that some cases are missing in each window\nMissing cases are assumed to be not evicted",
           size = 2.5, hjust = 1) +
  annotate(geom = "text", x = 7.5, y = 3.6, size = 3,
           label = "Impute survey non-response",
           fontface = "bold",
           color = "blue") +
  annotate(geom = "segment", x = 7.5, y = 3.4, size = 1.2,
           xend = 7.5, yend = 3.1,
           arrow = arrow(length = unit(.1, "inches"), angle = 40),
           color = "blue") +
  annotate(geom = "text", x = 7.5, y = 1.9, size = 3,
           label = "Infer prevalence over years when no survey was conducted",
           fontface = "bold",
           color = "seagreen4") +
  annotate(geom = "segment", x = 7.5, y = 1.7, size = 1.2,
           xend = 7.5, yend = 1.4,
           arrow = arrow(length = unit(.1, "inches"), angle = 40),
           color = "seagreen4") + 
  annotate(geom = "text", color = "red",
           x = 0, y = 4.2,
           label = paste0("Estimate 1: ",
                          format(100*lowerBound_and_MI$estimate[lowerBound_and_MI$estimand == "LowerBound"],
                                 digits = 2, format = "f")," % (CI: ",
                          format(100*lowerBound_and_MI$CI.min[lowerBound_and_MI$estimand == "LowerBound"],
                                 digits = 2, format = "f"),"\u2013",
                          format(100*lowerBound_and_MI$CI.max[lowerBound_and_MI$estimand == "LowerBound"],
                                 digits = 2, format = "f")," %)"),
           size = 3, hjust = 0, fontface = "bold") +
  annotate(geom = "text", color = "blue",
           x = 0, y = 2.9,
           label = "Multiple imputation",
           size = 3, hjust = 0, fontface = "bold") +
  annotate(geom = "text", color = "blue",
           x = 12, y = 2.5,
           label = "Long-term recall: May understate prevalence",
           size = 2.5) +
  annotate(geom = "text", color = "blue",
           x = 0, y = 2.5,
           label = paste0("Estimate 2: ",
                          format(100*lowerBound_and_MI$estimate[lowerBound_and_MI$estimand == "MI"],
                                 digits = 2, format = "f")," % (CI: ",
                          format(100*lowerBound_and_MI$CI.min[lowerBound_and_MI$estimand == "MI"],
                                 digits = 2, format = "f"),"\u2013",
                          format(100*lowerBound_and_MI$CI.max[lowerBound_and_MI$estimand == "MI"],
                                 digits = 3, format = "f")," %)"),
           size = 3, hjust = 0, fontface = "bold") +
  annotate(geom = "text", color = "seagreen4",
           x = 0, y = 1.2,
           label = "Multilevel logistic regression",
           size = 3, hjust = 0, fontface = "bold") +
  annotate(geom = "text", color = "seagreen4",
           x = 0, y = 1,
           label = "Model fit on five annual reports",
           size = 2.5, hjust = 0) +
  annotate(geom = "text", color = "seagreen4",
           x = 0, y = 0.6,
           label = "15-year prevalence inferred from model parameters",
           size = 2.5, hjust = 0) +
  annotate(geom = "text", color = "seagreen4",
           x = 0, y = 0,
           label = paste0("Estimate 3: Overall 15-year prevalence = ",
                          format(100*estimates$Mean[estimates$group == "Overall"],
                                 digits = 3, format = "f")," % (CI: ",
                          format(100*estimates$Min[estimates$group == "Overall"],
                                 digits = 3, format = "f"),"\u2013",
                          format(100*estimates$Max[estimates$group == "Overall"],
                                 digits = 3, format = "f")," %)"),
           size = 3, hjust = 0, fontface = "bold") +
  annotate(geom = "rect", color = "seagreen4", linetype = "longdash",
           alpha = 0,
           xmin = -.2, xmax = 11.7, ymin = -.2, ymax = 0.15) +
  scale_y_continuous(breaks = NULL, limits = c(-.2,4.6),
                     name = "") +
  scale_x_continuous(breaks = c(0,1,2,3,4,5,8,9,14,15),
                     name = "Child age") +
  ggtitle("B. Three estimates of the proportion ever evicted by age 15") +
  theme(legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(color = "black"),
        plot.title = element_text(size = 11)) +
  ggsave("Output/EvictionModels.pdf",
         height = 5, width = 6)

########################################
## Figure 3. Prevalence by subgroups ##
########################################

estimates %>%
  filter(grepl("income|race",bigGroup)) %>%
  ## Re-order groups to show up in the right order in the plot
  mutate(group = gsub(" percent","%",group),
         group = factor(group),
         group = relevel(group, ref = "Black"),
         group = relevel(group, ref = "Hispanic"),
         group = relevel(group, ref = "White/other"),
         group = relevel(group, ref = "Below 50% of poverty line"),
         group = relevel(group, ref = "50-100% of poverty line"),
         group = relevel(group, ref = "100-200% of poverty line"),
         group = relevel(group, ref = "200-300% of poverty line"),
         group = relevel(group, ref = "Above 300% of poverty line"),
         bigGroup = factor(bigGroup)) %>%
  ggplot(aes(x = group,
             y = Mean,
             ymin = Min,
             ymax = Max,
             label = paste(formatC(100*Mean, digits = 1, format = "f"),"%"))) +
  geom_errorbar(position = position_dodge(width = .8),
                width = .5) +
  geom_label(position = position_dodge(width = .8),
             label.padding = unit(0.2, "lines"),
             size = 3,
             show.legend = F) +
  ylab("Ever evicted by age 15\n(posterior mean and 95% credible interval)") +
  facet_wrap(~bigGroup, scales = "free", ncol = 1) +
  coord_flip() +
  ylim(c(0,.4)) +
  xlab("") +
  ggsave("Output/RaceIncomeGroups.pdf",
         height = 5, width = 6)

########################################
## Figure A5 and A6. City differences ##
########################################

estimates %>%
  filter(grepl("city",bigGroup)) %>%
  mutate(group = str_replace(group,"_"," "),
         group = fct_reorder(group,Mean)) %>%
  ggplot(aes(x = group,
             y = Mean,
             ymin = Min,
             ymax = Max,
             label = paste(formatC(100*Mean, digits = 1, format = "f"),"%"))) +
  geom_errorbar() +
  geom_label(label.padding = unit(0.2, "lines"), size = 2.5) +
  xlab("City of birth") +
  ylab("Ever evicted by age 15\n(posterior mean and 95% credible interval)") +
  coord_flip() +
  ylim(c(0,.4)) +
  ggsave("Output/ByCity.pdf",
         height = 5, width = 8)

## City differences from national estimate
cityDiff <- model_estimates %>%
  filter(grepl("city",group)) %>%
  left_join(
    model_estimates %>%
      filter(group == "overall.Overall") %>%
      transmute(overall = estimate,
                imputation = imputation,
                draw = draw),
    by = c("imputation", "draw")
  ) %>%
  mutate(difference = estimate - overall) %>%
  group_by(group) %>%
  summarize(Mean = mean(difference),
            Min = quantile(difference, .025),
            Max = quantile(difference, .975)) %>%
  group_by() %>%
  mutate(group = str_replace(group, "city.",""),
         group = str_replace(group,"_"," "),
         group = fct_reorder(group, Mean))

cityDiff %>%
  ggplot(aes(x = group,
             y = Mean,
             ymin = Min,
             ymax = Max,
             label = paste0(ifelse(Mean < 0, "", "+"),
                            formatC(100*Mean, digits = 1, format = "f")," %"))) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_errorbar() +
  geom_label(label.padding = unit(0.2, "lines"), size = 2.5) +
  xlab("City of birth") +
  ylab("Ever evicted by age 15: City deviations from national prevalence\n(posterior mean and 95% credible interval for difference)") +
  coord_flip() +
  annotate(geom = "label", x = 19.5, y = -0.05,
           label = paste("National estimate =",
                         formatC(100*estimates$Mean[estimates$group == "Overall"], digits = 1, format = "f"),
                         "%"),
           size = 3) +
  annotate(geom = "segment", x = 19.5, y = -0.02,
           xend = 19.5, yend = -0.003,
           arrow = arrow(length = unit(0.03, "npc"))) +
  ggsave("Output/CityDiff.pdf",
         height = 5, width = 8)

## Test city differences
## Get a one-sided p-value for being on the observed
## side of the national estimate
bhNat <- cityDiff <- model_estimates %>%
  filter(grepl("city",group)) %>%
  left_join(
    model_estimates %>%
      filter(group == "overall.Overall") %>%
      transmute(overall = estimate,
                imputation = imputation,
                draw = draw),
    by = c("imputation", "draw")
  ) %>%
  mutate(difference = estimate - overall) %>%
  group_by(group) %>%
  summarize(above = mean(difference > 0),
            below = mean(difference < 0)) %>%
  melt(id = "group") %>%
  arrange(value) %>%
  mutate(required = (1:40) / 40 * .05) %>%
  mutate(reject = value < required) %>%
  transmute(City = group,
            Null = ifelse(variable == "above",
                          "City has above-average prevalence",
                          "City has below-average prevalence"),
            Alternative = ifelse(variable == "below",
                                 "City has above-average prevalence",
                                 "City has below-average prevalence"),
            p = value,
            `Required BH threshold` = required,
            Decision = ifelse(value < required,
                              "Reject the null",
                              "Fail to reject"))
sink("Output/benjaminiHochberg.txt", append = FALSE, split = FALSE)
print(xtable(bhNat, digits = 4, caption = "Benjamini-Hochberg correction to tests for city differences in eviction prevalence from national estimate"),
      include.rownames = F,
      size = "scriptsize")
sink()

#####################################################
## Figure A7. Prevalence implied by various models ##
#####################################################


load("Intermediate/stanPred.Rdata")

stanPred %>%
  left_join(IDs %>%
              select(person, idnum),
            by = "person") %>%
  left_join(d %>%
              select(idnum, m1natwt),
            by = "idnum") %>%
  select(-person,-idnum) %>%
  filter(!is.na(m1natwt)) %>%
  melt(id = c("re","model","m1natwt"),
       variable.name = "draw") %>%
  group_by(re,model, draw) %>%
  summarize(prevalence.draws = weighted.mean(value, w = m1natwt)) %>%
  group_by(re,model) %>%
  summarize(prevalence = mean(prevalence.draws),
            ci.min = quantile(prevalence.draws, .025),
            ci.max = quantile(prevalence.draws, .975)) %>%
  mutate(model_withLabel = factor(model, labels = c(
    "(1)\nNo covariates",
    "(2)\n+ demographics",
    "(3)\n+ age and\nperiod",
    "(4)\n+ income and\nhousing\ncharacteristics",
    "(5)\n+ parental\nrisk\nfactors",
    "(6)\n+ neighborhood\ncharacteristics"
  ))) %>%
  group_by() %>%
  mutate(re = factor(re, labels = c(
    "Logistic regression",
    "+ random intercepts (city and child)"
  ))) %>%
  ggplot(aes(x = model_withLabel, y = prevalence,
             ymin = ci.min, ymax = ci.max,
             label = format(prevalence,
                            format = "f",
                            digits = 3))) +
  geom_errorbar() +
  geom_label() +
  theme(axis.text.x = element_text(size = 8)) +
  facet_wrap(~re, ncol = 1) +
  scale_x_discrete(name = element_blank()) +
  ylab("Ever evicted by age 15\n(posterior mean and 95% credible interval)") +
  ggsave("Output/model_comparison_prevalence.pdf",
         height = 4.5, width = 6.5)
rm(stanPred)

#########################################
## Figure A8. MSE under various models ##
#########################################

load("Intermediate/cvPred.Rdata")

# Overall CV results
cvPred %>%
  filter(!is.na(weight)) %>%
  select(-person) %>%
  melt(id = c("re","model","fold","weight","y"),
       variable.name = "draw",
       value.name = "phat") %>%
  group_by(re,model,fold,draw) %>%
  summarize(mse = weighted.mean((y - phat) ^ 2, w = weight)) %>%
  group_by(re, model) %>%
  summarize(mse.mean = mean(mse),
            mse.min = quantile(mse, .025),
            mse.max = quantile(mse, .975)) %>%
  group_by() %>%
  mutate(model_withLabel = factor(model, labels = c(
    "(1)\nNo covariates",
    "(2)\n+ demographics",
    "(3)\n+ age and\nperiod",
    "(4)\n+ income and\nhousing\ncharacteristics",
    "(5)\n+ parental\nrisk\nfactors",
    "(6)\n+ neighborhood\ncharacteristics"
  ))) %>%
  mutate(re = factor(re, labels = c(
    "Logistic regression",
    "+ random intercepts (city and child)"
  ))) %>%
  ggplot(aes(x = model_withLabel, y = mse.mean,
             ymin = mse.min, ymax = mse.max,
             label = format(mse.mean,
                            format = "f",
                            digits = 3))) +
  geom_label() +
  theme(axis.text.x = element_text(size = 8)) +
  facet_wrap(~re, ncol = 1) +
  scale_x_discrete(name = element_blank()) +
  ylab("Mean squared error\npredicting annual eviction") +
  ggsave("Output/model_comparison_cv.pdf",
         height = 4.5, width = 6.5)

## CV results by fold
cvPred %>%
  filter(!is.na(weight)) %>%
  select(-person) %>%
  melt(id = c("re","model","fold","weight","y"),
       variable.name = "draw",
       value.name = "phat") %>%
  group_by(re,model,fold,draw) %>%
  summarize(mse = weighted.mean((y - phat) ^ 2, w = weight)) %>%
  group_by(re, model, fold) %>%
  summarize(mse.mean = mean(mse),
            mse.min = quantile(mse, .025),
            mse.max = quantile(mse, .975)) %>%
  group_by() %>%
  mutate(model_withLabel = factor(model, labels = c(
    "(1)\nNo covariates",
    "(2)\n+ demographics",
    "(3)\n+ age and\nperiod",
    "(4)\n+ income and\nhousing\ncharacteristics",
    "(5)\n+ parental\nrisk\nfactors",
    "(6)\n+ neighborhood\ncharacteristics"
  ))) %>%
  mutate(re = factor(re, labels = c(
    "Logistic regression",
    "+ random intercepts (city and child)"
  ))) %>%
  mutate(fold = paste("Fold",fold)) %>%
  ggplot(aes(x = model_withLabel, y = mse.mean,
             ymin = mse.min, ymax = mse.max,
             label = format(mse.mean,
                            format = "f",
                            digits = 3))) +
  geom_errorbar() +
  geom_label(size = 1.8) +
  theme(axis.text.x = element_text(size = 8)) +
  facet_wrap(~re + fold, ncol = 2, dir = "v",
             # Credit to Roland at https://stackoverflow.com/questions/41311810/how-to-reduce-vertical-spacing-between-facet-labels-when-using-facet-wrap
             # for this labeller function
             labeller = function (labels) {
               labels <- lapply(labels, as.character)
               list(do.call(paste, c(labels, list(sep = "\n"))))
             }) +
  scale_x_discrete(name = element_blank()) +
  theme(axis.text.x = element_text(size = 6)) +
  ylab("Mean squared error predicting annual eviction\n(posterior mean and 95% credible interval)") +
  ggsave("Output/model_comparison_cvByFold.pdf",
         height = 8.5, width = 8.5)

## Specific numbers for Supplement Part 4 text
# Overall CV results
sink("Output/supplement_part4.txt", append = F, split = F)
print("Cross-validation results for preferred multilevel logit model")
print("Unweighted estimates:")
print(
  cvPred %>%
    group_by() %>%
    filter(model == max(model) & re == T) %>%
    select(-person, -model, -re) %>%
    melt(id = c("fold","weight","y"),
         variable.name = "draw",
         value.name = "phat") %>%
    summarize(mse.unweighted = mean((y - phat) ^ 2),
              bias.unweighted = mean(y - phat),
              predicted.unweighted = mean(phat),
              observed.unweighted = mean(y)) %>%
    melt(id = NULL)
)
print("Weighted estimates:")
print(
  cvPred %>%
    group_by() %>%
    filter(model == max(model) & re == T & !is.na(weight)) %>%
    select(-person, -model, -re) %>%
    melt(id = c("fold","weight","y"),
         variable.name = "draw",
         value.name = "phat") %>%
    summarize(mse = weighted.mean((y - phat) ^ 2, w = weight),
              bias = weighted.mean(y - phat, w = weight),
              predicted = weighted.mean(phat, w = weight),
              observed = weighted.mean(y, w = weight)) %>%
    melt(id = NULL)
)
sink()

rm(cvPred)

#######################
## Appendix Table A1 ##
#######################

table_a1 <- data.frame(
  wave = c(1:5, "Retrospective"),
  valid = apply(d[,c(paste0("valid",2:6),"valid6long")],2,mean),
  complete = (d %>%
                filter(!is.na(ev2) & !is.na(ev3) & !is.na(ev4) &
                         !is.na(ev5) & !is.na(ev6) & !is.na(ev6long)) %>%
                select(idnum,ev2,ev3,ev4,ev5,ev6,ev6long) %>%
                melt(id = "idnum") %>%
                group_by(variable) %>%
                summarize(prop.evicted = mean(value)))$prop.evicted,
  available = (d %>%
                 select(idnum,ev2,ev3,ev4,ev5,ev6,ev6long) %>%
                 melt(id = "idnum") %>%
                 group_by(variable) %>%
                 summarize(prop.evicted = mean(value, na.rm = T)))$prop.evicted,
  imputed = c(
    rowMeans(sapply(amelia.out$imputations, function(imp) {
      colMeans(imp[,paste0("ev.imp",2:6)])
    })),
    mean(sapply(imputed.retro$imputations, function(imp) {
      mean(imp$ev6long)
    }))
  ),
  imputed.weighted = rowMeans(sapply(1:n.imps, function(i) {
    c(ev2 = weighted.mean(amelia.out$imputations[[i]]$ev.imp2[!is.na(d$m1natwt)],
                          w = na.omit(d$m1natwt)),
      ev3 = weighted.mean(amelia.out$imputations[[i]]$ev.imp3[!is.na(d$m1natwt)],
                          w = na.omit(d$m1natwt)),
      ev4 = weighted.mean(amelia.out$imputations[[i]]$ev.imp4[!is.na(d$m1natwt)],
                          w = na.omit(d$m1natwt)),
      ev5 = weighted.mean(amelia.out$imputations[[i]]$ev.imp5[!is.na(d$m1natwt)],
                          w = na.omit(d$m1natwt)),
      ev6 = weighted.mean(amelia.out$imputations[[i]]$ev.imp6[!is.na(d$m1natwt)],
                          w = na.omit(d$m1natwt)),
      ev6long = weighted.mean(imputed.retro$imputations[[i]]$ev6long[!is.na(d$m1natwt)],
                              w = na.omit(d$m1natwt)))
  })),
  complete.n = nrow(d %>%
                      filter(!is.na(ev2) & !is.na(ev3) & !is.na(ev4) &
                               !is.na(ev5) & !is.na(ev6) & !is.na(ev6long))),
  imputed.n = nrow(d),
  imputed.weighted.n = sum(!is.na(d$m1natwt))
)
write_csv(table_a1, path = "Output/table_a1.csv")

###############################
## Table A2. Three estimates ##
###############################
write_csv(
  bind_rows(
    lowerBound_and_MI,
    estimates %>%
      filter(group == "Overall") %>%
      transmute(estimand = "Modeled",
                estimate = Mean,
                CI.min = Min,
                CI.max = Max)
  ),
  path = "Output/table_a2.csv"
)

## Note: We create Table A3, which summarizes the
## coefficients from the model, after we create
## the trace plots.

#########################################
## Figure A1 and A2                    ##
## Diagnostics for multiple imputation ##
#########################################

cl <- makeCluster(10)
registerDoParallel(cl)
over_cts <- foreach(
  var = c(paste0("inc",2:6),
          paste0("rent_burden",2:6),
          "cm1age",paste0("age",2:6),paste0("date",2:6),
          "impulsivity","cm3cogsc","prop.hisp","prop.black",
          "prop.all.other","prop.below.povline","med.rent.over.income"),
  .combine = "rbind", .packages = c("Amelia","tidyverse")
) %dopar% {
  return(data.frame(overimpute(amelia.untransformed, var = var)) %>%
           transmute(variable = var,
                     Imputed = mean.overimputed,
                     Observed = orig,
                     num = n()))
}
over_bin <- foreach(
  var = c(paste0("ev",2:6),
          paste0("own",2:6),
          "married","m1imm",
          "cmf2fevjail"),
  .combine = "rbind", .packages = c("Amelia","tidyverse")
) %dopar% {
  return(data.frame(overimpute(amelia.untransformed, var = var)) %>%
           summarize(variable = var,
                     Imputed = mean(mean.overimputed),
                     Observed = mean(orig),
                     num = n()))
}
stopCluster(cl)

## We do not overimpute the nominal or ordinal variables which are more complicated
## and have very low missingness (city of birth, mother's race, and mother's education)

over_cts %>%
  melt(id = c("variable","num"),
       variable.name = "type") %>%
  mutate(group = factor(case_when(grepl("^age",variable) ~ 1,
                                  grepl("date",variable) ~ 2,
                                  grepl("^inc",variable) ~ 3,
                                  grepl("rent_burden",variable) ~ 4,
                                  grepl("prop|med.rent",variable) ~ 5,
                                  grepl("cm1age",variable) ~ 6,
                                  grepl("impulsivity|cm3cogsc",variable) ~ 7),
                        labels = c("Child age",
                                   "Interview date",
                                   "Income / poverty",
                                   "Rent / income",
                                   "Neighborhood characteristics",
                                   "Mother's age (birth)",
                                   "Mother's scores")),
         variable = paste0(gsub("inc|rent_burden|^age|date|med[.]rent[.]over[.]income","Wave ",variable),
                           "\nN = ",num),
         variable = gsub("cm1age","",variable),
         variable = gsub("cm3cogsc","WAIS-R cognitive score",variable),
         variable = gsub("impulsivity","Impulsivity (Dickman)",variable),
         variable = gsub("prop.all.other","Proportion other race",variable),
         variable = gsub("prop.black","Proportion black",variable),
         variable = gsub("prop.hisp","Proportion Hispanic",variable),
         variable = gsub("prop.below.povline","Prop. below poverty line",variable)) %>%
  ggplot(aes(x = value, fill = type)) +
  geom_density(alpha = .4) +
  ylab("Density") +
  xlab("True and imputed values for observed cases of continuous variables") +
  theme(legend.title = element_blank()) +
  facet_wrap(~group + variable, scales = "free", ncol = 5) +
  theme(text = element_text(size = 8)) +
  ggsave("Output/amelia_diagnostics_cts.pdf",
         height = 6.5, width = 9)

over_bin %>%
  melt(id = c("variable","num"),
       variable.name = "type") %>%
  mutate(group = factor(case_when(variable == "ev2" ~ 1,
                                  variable == "ev3" ~ 2,
                                  variable == "ev4" ~ 3,
                                  variable == "ev5" ~ 4,
                                  variable == "ev6" ~ 5,
                                  variable == "own2" ~ 6,
                                  variable == "own3" ~ 7,
                                  variable == "own4" ~ 8,
                                  variable == "own5" ~ 9,
                                  variable == "own6" ~ 10,
                                  variable == "married" ~ 11,
                                  variable == "m1imm" ~ 12,
                                  variable == "cmf2fevjail" ~ 13),
                        labels = c("Evicted\nWave 2","Evicted\nWave 3","Evicted\nWave 4","Evicted\nWave 5","Evicted\nWave 6",
                                   "Lives in owned home\nWave 2","Lives in owned home\nWave 3","Lives in owned home\nWave 4",
                                   "Lives in owned home\nWave 5","Lives in owned home\nWave 6",
                                   "Parents married\nat birth",
                                   "Mother foreign-born",
                                   "Father ever in jail\nby child age 1")),
         variable = paste0("N = ",num)) %>%
  ggplot(aes(x = variable, y = value, fill = type, color = type,
             label = format(value, digits = 2, format = "f"))) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_label(aes(y = value + .05, fill = NULL),
             position = position_dodge(width = 1)) +
  ylab("Proportion") +
  xlab("True and imputed mean for observed cases of binary variables") +
  scale_x_discrete(breaks = element_blank()) +
  ylim(c(0, .5)) +
  theme(legend.title = element_blank()) +
  facet_wrap(~group + variable, scales = "free_x", ncol = 5) +
  ggsave("Output/amelia_diagnostics_bin.pdf",
         height = 6.5, width = 9)

## Check the proportion of the variance that is between imputations
## Posterior variance within and between imputations (Rubin's rules)
sink("Output/mi_variance_decomp.txt", append = F, split = F)
print(
  model_estimates %>%
    filter(group == "overall.Overall") %>%
    group_by(imputation) %>%
    summarize(within.variance = var(estimate),
              imputation.mean = mean(estimate)) %>%
    group_by() %>%
    summarize(within = mean(within.variance),
              between = (1 + 1 / 30) * var(imputation.mean)) %>%
    mutate(total = within + between,
           prop.between = between / total)
)

## Compare to the raw estimate of the total variance by simulation
print("Raw estimate of total posterior variance")
print(
  model_estimates %>%
    filter(group == "overall.Overall") %>%
    summarize(total = var(estimate))
)
sink()

#####################################
## Figure A3. Summarize the priors ##
#####################################

data.frame(
  Intercept = seq(-7.5,-1.5,.01)
) %>%
  mutate(PriorDensity = dcauchy(Intercept,-4.5,1)) %>%
  ggplot(aes(x = Intercept,
             y = PriorDensity,
             ymin = 0,
             ymax = PriorDensity)) +
  ylab("Prior density") +
  xlab("Intercept\nCauchy(-4.5, 1)") +
  scale_x_continuous(breaks = c(-6.5,-4.5,-2.5)) +
  geom_ribbon(alpha = .4, fill = "blue") +
  ggsave("Output/priorIntercept.pdf",
         height = 2, width = 4)

data.frame(
  p = seq(.001,.15,.001)
) %>%
  mutate(PriorDensity = dcauchy(qlogis(p),-4.5,1)) %>%
  ggplot(aes(x = p,
             y = PriorDensity,
             ymin = 0,
             ymax = PriorDensity)) +
  xlab("Annual probability of eviction\nfor a child with average characteristics") +
  ylab("Prior density") +
  geom_ribbon(alpha = .4, fill = "blue") +
  ggsave("Output/priorInterceptP.pdf",
         height = 2, width = 4)

data.frame(
  Slope = seq(-3,3,.01)
) %>%
  mutate(PriorDensity = dcauchy(Slope,0,1)) %>%
  ggplot(aes(x = Slope,
             y = PriorDensity,
             ymin = 0,
             ymax = PriorDensity)) +
  xlab("Slope\nCauchy(0, 1)") +
  ylab("Prior density") +
  geom_ribbon(alpha = .4, fill = "blue") +
  ggsave("Output/priorSlope.pdf",
         height = 2, width = 4)

data.frame(
  OR = seq(.05,10,.05)
) %>%
  mutate(PriorDensity = dcauchy(log(OR),0,1)) %>%
  ggplot(aes(x = OR,
             y = PriorDensity,
             ymin = 0,
             ymax = PriorDensity)) +
  scale_x_continuous(breaks = 0:10) +
  xlab("Factor by which odds of eviction change\nwith 2 SD increase in a continuous predictor\nor a unit increase in a binary predictor") +
  ylab("Prior density") +
  geom_ribbon(alpha = .4, fill = "blue") +
  ggsave("Output/priorSlopeOR.pdf",
         height = 2, width = 4)

data.frame(
  StandardDeviation = seq(.01,4,.01)
) %>%
  mutate(PriorDensity = 2*dcauchy(StandardDeviation,0,1)) %>%
  ggplot(aes(x = StandardDeviation,
             y = PriorDensity,
             ymin = 0,
             ymax = PriorDensity)) +
  xlab("Standard deviation of\nchild and city random intercepts\nHalf-Cauchy(0, 1)") +
  ylab("Prior density") +
  geom_ribbon(alpha = .4, fill = "blue") +
  ggsave("Output/REprior.pdf",
         height = 2, width = 4)

data.frame(
  OR = seq(1,10,.1)
) %>%
  mutate(PriorDensity = 2*dcauchy(log(OR),0,1)) %>%
  ggplot(aes(x = OR,
             y = PriorDensity,
             ymin = 0,
             ymax = PriorDensity)) +
  xlab("Factor by which odds of eviction increase\nwith 1 SD increase in child or city random effects") +
  ylab("Prior density") +
  scale_x_continuous(breaks = 0:10) +
  geom_ribbon(alpha = .4, fill = "blue") +
  ggsave("Output/REpriorOR.pdf",
         height = 2, width = 4)

############################
## Figure A4. Trace plots ##
############################

## Sigma draws
cl <- makeCluster(15)
registerDoParallel(cl)
sigma.draws <- foreach(i = 1:n.imps, .combine = "rbind", .packages = "tidyverse") %dopar% {
  read_csv(paste0("Intermediate/stan_parameters/stan_reTRUE_model6_imp_",i,".csv"),
           comment = "#") %>%
    select_at(vars(contains("sigma")))
}
stopCluster(cl)

model_estimates %>%
  filter(group == "overall.Overall") %>%
  mutate(overall = estimate) %>%
  select(-group) %>%
  bind_cols(sigma.draws) %>%
  melt(id = c("draw","imputation")) %>%
  group_by(variable) %>%
  arrange(variable, imputation, draw) %>%
  mutate(Draw = 1:n()) %>%
  group_by() %>%
  mutate(variable = ifelse(variable == "sigma_city","A. SD of city random intercepts",
                           ifelse(variable == "sigma_person","B. SD of child random intercepts",
                                  "C. Proportion evicted by age 15\n(weighted)"))) %>%
  ggplot(aes(x = Draw, y = value, color = factor(imputation))) +
  geom_line(size = 0.05) +
  facet_wrap(~variable, ncol = 2, scales = "free") +
  xlab("Draw number (colors indicate separate imputation chains)") +
  ylab("Posterior draw") +
  theme(legend.position = "none") +
  scale_color_manual(values = rep(c("blue","seagreen4"), ceiling(n.imps / 2))) +
  ggsave("Output/traceSmall.pdf",
         height = 4, width = 8)

#####################################
## Table A3. Coefficient estimates ##
#####################################

## First identify the scale factors, including for interaction terms
scaleFactorsExtended <- c(scale.factors,
                          ## Append scale factor of 1 for interaction
                          ## terms since these are just the interaction
                          ## of dummy variables and thus were not rescaled
                          rep(1,4))
names(scaleFactorsExtended)[(length(scale.factors) + 1):length(scaleFactorsExtended)] <-
  c("married:IncBelow50PctPovLine","married:Inc50.100PctPovLine",
    "married:Inc100.200PctPovLine","married:Inc200.300PctPovLine")

cl <- makeCluster(15)
registerDoParallel(cl)
coefficient.draws <- foreach(i = 1:n.imps, .combine = "rbind", .packages = c("tidyverse","reshape2")) %dopar% {
  read_csv(paste0("Intermediate/stan_parameters/stan_reTRUE_model6_imp_",i,".csv"),
           comment = "#") %>%
    select_at(vars(contains("beta"))) %>%
    melt(id = NULL) %>%
    mutate(variable = as.numeric(str_replace(variable,"beta.",""))) %>%
    mutate(value = value / scaleFactorsExtended[variable],
           oddsRatio = exp(value),
           variable = factor(variable,
                             labels = names(scaleFactorsExtended)))
}
stopCluster(cl)
table_a3 <- coefficient.draws %>%
  group_by(variable) %>%
  summarize(PMean = mean(oddsRatio),
            CI.min = quantile(oddsRatio,.025),
            CI.max = quantile(oddsRatio,.975)) %>%
  bind_rows(
    sigma.draws %>%
      melt(id = NULL) %>%
      group_by(variable) %>%
      summarize(PMean = mean(exp(value)),
                CI.min = quantile(exp(value),.025),
                CI.max = quantile(exp(value),.975))
  )

write.csv(table_a3, file = "Output/table_a3.csv")

##############################
## Numbers reported in text ##
##############################

sink("Output/text_notes.txt", append = F, split = F)
load("Intermediate/merged.Rdata")

## Ownership in prior wave for parents reporting eviction
print("Ownership in prior wave for parents reporting eviction")
print(
  merged %>%
    transmute(
      idnum = idnum,
      m1own = case_when(m1f2 == 1 ~ T,
                        m1f2 == 0 ~ F),
      m2own = case_when(m2h2 %in% (4:5) ~ T,
                        m2h2 > 0 ~ F),
      m3own = case_when(m3i2 %in% (4:5) ~ T,
                        m3i2 > 0 ~ F),
      m4own = case_when(m4i1 != 1 ~ m3own,
                        m4i2 %in% (4:5) ~ T,
                        m4i2 > 0 ~ F),
      m5own = case_when(m5f1 != 1 ~ m4own,
                        m5f2 %in% (4:5) ~ T,
                        m5f2 > 0 ~ F),
      f1own = case_when(f1f2 == 1 ~ T,
                        f1f2 == 0 ~ F),
      f2own = case_when(f2h2 %in% (4:5) ~ T,
                        f2h2 > 0 ~ F),
      f3own = case_when(f3i2 %in% (4:5) ~ T,
                        f3i2 > 0 ~ F),
      f4own = case_when(f4i1 != 1 ~ f3own,
                        f4i2 %in% (4:5) ~ T,
                        f4i2 > 0 ~ F),
      f5own = case_when(f5f1 != 1 ~ f4own,
                        f5f2 %in% (4:5) ~ T,
                        f5f2 > 0 ~ F),
      m6ev = case_when(p6j40 >= 0 & cp6pcgrel == 1 ~ p6j40 == 1),
      f6ev = case_when(p6j40 >= 0 & cp6pcgrel == 2 ~ p6j40 == 1)
    ) %>%
    left_join(d %>%
                select(idnum, ends_with("ev")),
              by = "idnum") %>%
    rename_at(vars(ends_with("own")), function(x) {
      number <- as.numeric(str_replace_all(x,"[[:alpha:]]",""))
      respondent <- substr(x,1,1)
      paste0(respondent,".own.", number + 1)
    }) %>%
    rename_at(vars(ends_with("ev")), function(x) {
      quantity <- case_when(grepl("^m",x) ~ "m.ev.",
                            grepl("^f",x) ~ "f.ev.")
      number <- str_replace_all(x,"[[:alpha:]]","")
      return(paste0(quantity,number))
    }) %>%
    melt(id = "idnum") %>%
    separate(variable, into = c("reporter","variable","wave")) %>%
    spread(key = "variable", value = "value") %>%
    filter(ev == T) %>%
    summarize(prop_nonMissing_own = mean(own, na.rm = T),
              prop_missing = mean(is.na(own)))
)

## Years covered by lower bound estimator
## Note how many years are covered
print("Mean # annual reports")
weighted.mean((as.numeric(!is.na(d$ev2)) + 
                 as.numeric(!is.na(d$ev3)) + 
                 as.numeric(!is.na(d$ev4)) + 
                 as.numeric(!is.na(d$ev5)) + 
                 as.numeric(!is.na(d$ev6)))[!is.na(d$m1natwt)],
              w = na.omit(d$m1natwt))
print("Proportion with retrospective report")
print(weighted.mean(!is.na(d$ev6long)[!is.na(d$m1natwt)],
                    w = na.omit(d$m1natwt)))

## Check alpha of impulsivity scale
print("Alpha of impulsivity scale")
print(psych::alpha(na.omit(data.frame(
  ifelse(merged$m3j44a >= 0, merged$m3j44a, NA),
  ifelse(merged$m3j44b >= 0, merged$m3j44b, NA),
  ifelse(merged$m3j44c >= 0, merged$m3j44c, NA),
  ifelse(merged$m3j44d >= 0, merged$m3j44d, NA),
  ifelse(merged$m3j44e >= 0, merged$m3j44e, NA),
  ifelse(merged$m3j44f >= 0, merged$m3j44f, NA)
))))
rm(merged)

## For Online Supplement Part 3, note observed person-years evicted
## and implied 15-year prevalence assuming person-years are iid draws
print("Prop. of person-years evicted and implied estimate if IID")
print(
  d %>%
    select(idnum,ev2,ev3,ev4,ev5,ev6) %>%
    melt(id = "idnum") %>%
    filter(!is.na(value)) %>%
    summarize(prob_evicted = mean(value)) %>%
    mutate(cum_prob = 1 - (1 - prob_evicted) ^ 15)
)
sink()

## Store modeled estimates to report in text
## This gives numeric values to the ends of CIs
sink("Output/prevalence_modeled.txt", append = FALSE, split = FALSE)
print("By demographic groups")
print(data.frame(estimates))
sink()

##################################
## FOOTNOTE: Multiple evictions ##
##################################

## This is moderately memory-intensive,
## so removing unnecessary objects first
rm(list = ls())

load("Intermediate/d.Rdata")
load("Intermediate/imputed.cumulative.estimates.Rdata")
load("Intermediate/specification_numbers.Rdata")
load("Intermediate/newData.Rdata")
load("Intermediate/IDs.Rdata")

sink("Output/multiple.txt", append = FALSE, split = FALSE)
## Firm lower bound
print("Firm lower bound on multiple evictions")
print(weighted.mean(apply(d[!is.na(d$m1natwt),
                            c("ev2","ev3","ev4","ev5","ev6long")],
                          1, function(x) sum(na.omit(x)) > 1),
                    w = na.omit(d$m1natwt)))
print("95% CI for firm lower bound on multiple evictions")
cl <- makeCluster(10)
registerDoParallel(cl)
set.seed(08544)
print(
  quantile(foreach (i = 1:bs.samps, .combine = "c", .packages = "tidyverse") %dorng% {
    samp <- d %>%
      filter(!is.na(m1natwt)) %>%
      sample_frac(weight = m1natwt, replace = T) %>%
      select(ev2,ev3,ev4,ev5,ev6long)
    mean(apply(samp, 1, function(x) sum(na.omit(x)) > 1))
  }, c(.025, .975))
)
stopCluster(cl)
## Multiple imputation
print("MI estimate for multiple evictions")
print(mean(sapply(imputed.cumulative.estimates$multiple.ev.retro$imputations,
                  function(imp) weighted.mean(imp$multiple.ev.retro[!is.na(d$m1natwt)],
                                              w = imp$m1natwt[!is.na(d$m1natwt)]))))
print("95% CI for firm MI estimate of multiple evictions")
cl <- makeCluster(10)
registerDoParallel(cl)
set.seed(08544)
print(
  quantile(foreach (i = 1:bs.samps, .combine = "c", .packages = "tidyverse") %dorng% {
    imp <- (i - 1) %% n.imps + 1
    samp <- imputed.cumulative.estimates$multiple.ev.retro$imputations[[imp]] %>%
      filter(!is.na(m1natwt)) %>%
      sample_frac(weight = m1natwt, replace = T) %>%
      select(multiple.ev.retro)
    mean(samp$multiple.ev.retro)
  }, c(.025, .975))
)
stopCluster(cl)
## Modeled
cl <- makeCluster(5, outfile = "")
registerDoParallel(cl)
p_multiple <- foreach(i = 1:n.imps, .combine = "rbind", .packages = c("tidyverse","reshape2")) %dopar% {
  beta <- read_csv(paste0("Intermediate/stan_parameters/stan_reTRUE_model6_imp_",i,".csv"),
                   comment = "#") %>%
    select_at(vars(contains("beta")))
  alpha <-  read_csv(paste0("Intermediate/stan_parameters/stan_reTRUE_model6_imp_",i,".csv"),
                     comment = "#") %>%
    select_at(vars(contains("alpha")))
  u_city <-  read_csv(paste0("Intermediate/stan_parameters/stan_reTRUE_model6_imp_",i,".csv"),
                      comment = "#") %>%
    select_at(vars(contains("u_city")))
  u_person <-  read_csv(paste0("Intermediate/stan_parameters/stan_reTRUE_model6_imp_",i,".csv"),
                        comment = "#") %>%
    select_at(vars(contains("u_person")))
  X <- newData[[6]][[i]]$X
  city <- newData[[6]][[i]]$city
  person <- newData[[6]][[i]]$person
  
  ## Combine to make the probability of eviction
  p <- plogis(X %*% t(beta) + 
                t(replicate(nrow(X),alpha$alpha)) +
                t(u_city[,city]) +
                t(u_person[,person]))
  
  ## Start with the cumulative probability of 0 evictions
  cum.prob.0 <- data.frame(person = person,
                           p) %>%
    group_by(person) %>%
    summarize_all(.funs = function(x) prod(1 - x)) %>%
    group_by() %>%
    melt(id = "person",
         variable.name = "draw",
         value.name = "cum.prob.0")
  ## Then the probability of exactly 1 eviction
  cum.prob.1 <- data.frame(person = person,
                           p) %>%
    group_by(person) %>%
    summarize_all(.funs = function(x) {
      sum(sapply(1:15, function(yrEvicted) {
        x[yrEvicted]*prod(1 - x[-yrEvicted])
      }))
    }) %>%
    group_by() %>%
    melt(id = "person",
         variable.name = "draw",
         value.name = "cum.prob.1")
  
  ## The probability of multiple evictions is what is left
  cum.prob.multiple <- cum.prob.1 %>%
    left_join(cum.prob.0,
              by = c("person","draw")) %>%
    mutate(cum.prob.multiple = 1 - cum.prob.0 - cum.prob.1) %>%
    left_join(IDs, by = "person") %>%
    left_join(d %>%
                select(idnum, m1natwt),
              by = "idnum") %>%
    filter(!is.na(m1natwt)) %>%
    group_by(draw) %>%
    summarize(prevalence.multiple = weighted.mean(cum.prob.multiple,
                                                  w = m1natwt))
  return(cum.prob.multiple)
}
stopCluster(cl)

print("Modeled estimate for multiple evictions")
print(data.frame(
  p_multiple %>%
    summarize(estimate.mean = mean(prevalence.multiple),
              estimate.min = quantile(prevalence.multiple, .025),
              estimate.max = quantile(prevalence.multiple, .975))
))

sink()

##########################################
## Numbers reported in text of appendix ##
##########################################

## Note some statistics to build intuition about the priors
## These are in the appendix to the paper

sink("Output/prior_intuition.txt", append = FALSE, split = FALSE)
## Prior density on average child eviction risk
print("Middle 50% of prior on average child eviction risk")
print(plogis(qcauchy(c(.25,.75), -4.5, 1)))
print("Middle 90% of prior on average child eviction risk")
print(plogis(qcauchy(c(.05,.95), -4.5, 1)))
## Prior density on odds ratios
print("Middle 50% of prior on odds ratios")
print(exp(qcauchy(c(.25, .75), 0, 1)))
print("Middle 90% of prior on odds ratios")
print(exp(qcauchy(c(.05, .95), 0, 1)))
## Prior on SD of random intercepts
print("Lower 75% of prior on sd of random intercepts")
print(exp(qcauchy(.75, 0, 1)))
print("Lower 95% of prior on sd of random intercepts")
print(exp(qcauchy(.95, 0, 1)))
sink()

## Test the city differences net of everything
cl <- makeCluster(15)
registerDoParallel(cl)
city_diffs <- foreach(i = 1:n.imps, .combine = "rbind", .packages = "tidyverse") %dopar% {
  read_csv(paste0("Intermediate/stan_parameters/stan_reTRUE_model6_imp_",i,".csv"),
           comment = "#") %>%
    select_at(vars(contains("u_city")))
}
stopCluster(cl)
sink("Output/city_u.txt", append = FALSE, split = FALSE)
print(sort(colMeans(city_diffs > 0)))
sink()
